This notebook illustrates using clustering and our descriptors compounds.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode()


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer



import warnings
warnings.filterwarnings("ignore")
In [2]:
# import data
data = pd.read_csv("../../../data/processed/drug_bank_clean.csv")
In [3]:
data.head(2)
Out[3]:
Name SMILES ALogP ALogp2 AMR apol nAcid naAromAtom nAromBond nAtom ... MW WTPT-1 WTPT-2 WTPT-3 WTPT-4 WTPT-5 WPATH WPOL XLogP Zagreb
0 Bivalirudin CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@... -5.0931 25.939668 534.3813 317.363434 6 18 18 293 ... 2180.289181 306.516036 1.977523 153.54308 82.176355 71.366725 286334.0 230.0 -8.623 752.0
1 Leuprolide CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=... 0.4325 0.187056 317.0896 187.074612 0 20 21 171 ... 1209.400552 174.200943 2.002310 78.03037 30.548034 47.482336 46759.0 130.0 1.134 436.0

2 rows × 224 columns

In [4]:
# getting all descriptors except compound name
without_compound = data.drop(['Name','SMILES'], axis=1)
without_compound.head(2)
Out[4]:
ALogP ALogp2 AMR apol nAcid naAromAtom nAromBond nAtom ATSc1 ATSc2 ... MW WTPT-1 WTPT-2 WTPT-3 WTPT-4 WTPT-5 WPATH WPOL XLogP Zagreb
0 -5.0931 25.939668 534.3813 317.363434 6 18 18 293 5.125258 -2.388694 ... 2180.289181 306.516036 1.977523 153.54308 82.176355 71.366725 286334.0 230.0 -8.623 752.0
1 0.4325 0.187056 317.0896 187.074612 0 20 21 171 2.269431 -1.093604 ... 1209.400552 174.200943 2.002310 78.03037 30.548034 47.482336 46759.0 130.0 1.134 436.0

2 rows × 222 columns

In [5]:
# scaling our data by using standard scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(without_compound)
In [6]:
# Instantiate the KMeans model
kmeans_model = KMeans()

# Instantiate the KElbowVisualizer
elbow_visualizer = KElbowVisualizer(kmeans_model, k=(2, 10))

# Fit the visualizer to the data
elbow_visualizer.fit(X_scaled)

# Visualize the elbow plot
elbow_visualizer.show()
Out[6]:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [7]:
sns.set()
In [8]:
# elbow method
n_clusters = 20
cost = []
for i in range(2, n_clusters):
    kmean= KMeans(i, n_init=10, max_iter=1000)
    kmean.fit(X_scaled)
    labels = kmean.labels_
    
    cost.append(kmean.inertia_)
    print("Cluster {} Sillhoute Score {}".format(i, silhouette_score(without_compound, labels)))  

plt.title("Elbow Method")
plt.plot(cost, 'bx-');
Cluster 2 Sillhoute Score 0.5834159976927242
Cluster 3 Sillhoute Score 0.4588847380871933
Cluster 4 Sillhoute Score 0.32065967865045736
Cluster 5 Sillhoute Score 0.3372377183172219
Cluster 6 Sillhoute Score 0.2430986558790337
Cluster 7 Sillhoute Score 0.21382251489820583
Cluster 8 Sillhoute Score 0.20660917772860202
Cluster 9 Sillhoute Score 0.1782988418325326
Cluster 10 Sillhoute Score 0.0992579974286342
Cluster 11 Sillhoute Score 0.13565552234357864
Cluster 12 Sillhoute Score 0.09488133302010804
Cluster 13 Sillhoute Score 0.0596659543420617
Cluster 14 Sillhoute Score 0.059722622959081234
Cluster 15 Sillhoute Score 0.08037753431974347
Cluster 16 Sillhoute Score 0.012488486107317178
Cluster 17 Sillhoute Score 0.04471580376897327
Cluster 18 Sillhoute Score 0.002427637283397599
Cluster 19 Sillhoute Score 0.022726472743283842
In [9]:
kmean = KMeans(4, n_init=10, max_iter=1000)
kmean.fit(X_scaled)
labels = kmean.labels_
In [10]:
# adding clusters to our data
clusters = pd.concat([without_compound, pd.DataFrame({'cluster': labels})], axis=1)
clusters.head()
Out[10]:
ALogP ALogp2 AMR apol nAcid naAromAtom nAromBond nAtom ATSc1 ATSc2 ... WTPT-1 WTPT-2 WTPT-3 WTPT-4 WTPT-5 WPATH WPOL XLogP Zagreb cluster
0 -5.0931 25.939668 534.3813 317.363434 6 18 18 293 5.125258 -2.388694 ... 306.516036 1.977523 153.543080 82.176355 71.366725 286334.0 230.0 -8.623 752.0 2
1 0.4325 0.187056 317.0896 187.074612 0 20 21 171 2.269431 -1.093604 ... 174.200943 2.002310 78.030370 30.548034 47.482336 46759.0 130.0 1.134 436.0 2
2 -1.8743 3.513000 325.6625 190.878612 0 20 21 175 2.635543 -1.251529 ... 181.809125 1.997902 88.729976 35.878826 52.851150 52357.0 134.0 -2.012 458.0 2
3 9.1843 84.351366 493.8733 292.709055 0 36 40 266 3.021552 -1.418733 ... 261.668224 1.997467 98.311965 40.550947 57.761019 148212.0 208.0 8.583 660.0 2
4 -2.8933 8.371185 269.4076 154.458752 0 12 12 138 2.148910 -0.961833 ... 147.352827 1.991254 75.875569 30.211599 39.611320 28498.0 110.0 -1.437 360.0 2

5 rows × 223 columns

In [11]:
silhouette_score(without_compound, labels)
Out[11]:
0.3227968445948065

All features¶

In [12]:
# using pca with 2 components to visualize our scaled data with clusters
components = 2

# initilizing and fitting the pca
pca = PCA(n_components=components)
X_pca = pca.fit_transform(X_scaled)

# making dataframe for our pca components
pca_df = pd.DataFrame(X_pca, columns=['PCA1','PCA2'])

# adding cluster assignment
pca_df['cluster'] = pd.Categorical(kmean.labels_)

# adding compound name to this dataframe
# pca_df['compound'] = data['Name']

# get the index of the most important feature on EACH component i.e. largest absolute value
most_important = [np.abs(pca.components_[i]).argmax() for i in range(components)]

# columns/ features of our data
initial_feature_names = without_compound.columns

# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(components)]

# using LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i + 1): most_important_names[i] for i in range(components)}

# explained variance
print("Explained variance ratio", pca.explained_variance_ratio_)
print("\nnSum of Explained variance ratio", pca.explained_variance_ratio_.sum())
print("\nColumns chosen by PCA", dic)


pca_df



# Adjust the figure size
plt.figure(figsize=(12, 8))
# Create the scatterplot using Seaborn
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue="cluster")

# Set the plot title
plt.title("Compounds Clustering - All Features")


# Show the plot
plt.show()
# fig = px.scatter(pca_df, x='PCA1', y='PCA2', color="cluster",  title="Compounds Clustering - All Features", width=800)

# fig.show()
# saving the figure in html file
# fig.write_html("cluster4.html")
Explained variance ratio [0.34087863 0.09107553]

nSum of Explained variance ratio 0.4319541614805409

Columns chosen by PCA {'PC1': 'Zagreb', 'PC2': 'khs.ssssC'}
In [13]:
for c in clusters:
    grid = sns.FacetGrid(clusters, col="cluster")
    grid.map(plt.hist, c)
In [17]:
for col in clusters.columns:
    if col == "cluster":
        pass
    sns.boxplot(data=clusters, x='cluster', y=col)
    plt.title("{} by Class".format(col))
    plt.show()

99% of Components¶

In [18]:
# using pca with 2 components to visualize our scaled data with clusters
components = 0.99

# initilizing and fitting the pca
pca = PCA(n_components=components)
X_pca = pca.fit_transform(X_scaled)
pca.n_components_
Out[18]:
82
In [19]:
kmean = KMeans(4, n_init=10, max_iter=1000)
kmean.fit(X_pca)
labels = kmean.labels_
In [20]:
silhouette_score(without_compound, labels)
Out[20]:
0.32058828761391345
In [21]:
# using pca with 2 components to visualize our scaled data with clusters
components = 2

# initilizing and fitting the pca
pca = PCA(n_components=components)
X_pca = pca.fit_transform(X_pca)

# making dataframe for our pca components
pca_df = pd.DataFrame(X_pca, columns=['PCA1','PCA2'])

# adding cluster assignment
pca_df['cluster'] = pd.Categorical(kmean.labels_)

# adding compound name to this dataframe
# pca_df['compound'] = data['Name']

# get the index of the most important feature on EACH component i.e. largest absolute value
most_important = [np.abs(pca.components_[i]).argmax() for i in range(components)]

# columns/ features of our data
initial_feature_names = without_compound.columns

# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(components)]

# using LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i + 1): most_important_names[i] for i in range(components)}

# explained variance
print("Explained variance ratio", pca.explained_variance_ratio_)
print("\nnSum of Explained variance ratio", pca.explained_variance_ratio_.sum())
print("\nColumns chosen by PCA", dic)


pca_df
# Adjust the figure size
plt.figure(figsize=(12, 8))
# Create the scatterplot using Seaborn
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue="cluster")

# Set the plot title
plt.title("Compounds Clustering - All Features")


# fig = px.scatter(pca_df, x='PCA1', y='PCA2', color="cluster", title="Compounds Clustering - 99% Components PCA", width=800)

# # saving the figure in html file
# # fig.write_html("cluster4.html")

# fig.show()
Explained variance ratio [0.34426944 0.09198149]

nSum of Explained variance ratio 0.4362509268238107

Columns chosen by PCA {'PC1': 'ALogP', 'PC2': 'ALogp2'}
Out[21]:
Text(0.5, 1.0, 'Compounds Clustering - All Features')
In [22]:
# adding clusters to our data
clusters = pd.concat([without_compound, pd.DataFrame({'cluster': labels})], axis=1)
clusters.head(3)
Out[22]:
ALogP ALogp2 AMR apol nAcid naAromAtom nAromBond nAtom ATSc1 ATSc2 ... WTPT-1 WTPT-2 WTPT-3 WTPT-4 WTPT-5 WPATH WPOL XLogP Zagreb cluster
0 -5.0931 25.939668 534.3813 317.363434 6 18 18 293 5.125258 -2.388694 ... 306.516036 1.977523 153.543080 82.176355 71.366725 286334.0 230.0 -8.623 752.0 3
1 0.4325 0.187056 317.0896 187.074612 0 20 21 171 2.269431 -1.093604 ... 174.200943 2.002310 78.030370 30.548034 47.482336 46759.0 130.0 1.134 436.0 3
2 -1.8743 3.513000 325.6625 190.878612 0 20 21 175 2.635543 -1.251529 ... 181.809125 1.997902 88.729976 35.878826 52.851150 52357.0 134.0 -2.012 458.0 3

3 rows × 223 columns

In [23]:
for c in clusters:
    grid = sns.FacetGrid(clusters, col="cluster")
    grid.map(plt.hist, c)
In [24]:
for col in clusters.columns:
    if col == "cluster":
        pass
    sns.boxplot(data=clusters, x='cluster', y=col)
    plt.title("{} by Class".format(col))
    plt.show()

2 Components¶

In [25]:
# using pca with 2 components to visualize our scaled data with clusters
components = 2

# initilizing and fitting the pca
pca = PCA(n_components=components)
X_pca = pca.fit_transform(X_scaled)
pca.n_components_
Out[25]:
2
In [26]:
kmean = KMeans(4, n_init=10, max_iter=1000)
kmean.fit(X_pca)
labels = kmean.labels_
In [27]:
silhouette_score(without_compound, labels)
Out[27]:
0.3388544836501914
In [28]:
# using pca with 2 components to visualize our scaled data with clusters
components = 2

# initilizing and fitting the pca
pca = PCA(n_components=components)
X_pca = pca.fit_transform(X_pca)

# making dataframe for our pca components
pca_df = pd.DataFrame(X_pca, columns=['PCA1','PCA2'])

# adding cluster assignment
pca_df['cluster'] = pd.Categorical(kmean.labels_)

# adding compound name to this dataframe
# pca_df['compound'] = data['Name']

# get the index of the most important feature on EACH component i.e. largest absolute value
most_important = [np.abs(pca.components_[i]).argmax() for i in range(components)]

# columns/ features of our data
initial_feature_names = without_compound.columns

# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(components)]

# using LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i + 1): most_important_names[i] for i in range(components)}

# explained variance
print("Explained variance ratio", pca.explained_variance_ratio_)
print("\nnSum of Explained variance ratio", pca.explained_variance_ratio_.sum())
print("\nColumns chosen by PCA", dic)


pca_df

# Adjust the figure size
plt.figure(figsize=(12, 8))
# Create the scatterplot using Seaborn
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue="cluster")

# Set the plot title
plt.title("Compounds Clustering - All Features")

# fig = px.scatter(pca_df, x='PCA1', y='PCA2', color="cluster", title="Compounds Clustering - 2 Components PCA", width=800)

# # saving the figure in html file
# # fig.write_html("cluster4.html")

# fig.show()
Explained variance ratio [0.78915463 0.21084537]

nSum of Explained variance ratio 1.0

Columns chosen by PCA {'PC1': 'ALogP', 'PC2': 'ALogp2'}
Out[28]:
Text(0.5, 1.0, 'Compounds Clustering - All Features')
In [29]:
# adding clusters to our data
clusters = pd.concat([without_compound, pd.DataFrame({'cluster': labels})], axis=1)
clusters.head(3)
Out[29]:
ALogP ALogp2 AMR apol nAcid naAromAtom nAromBond nAtom ATSc1 ATSc2 ... WTPT-1 WTPT-2 WTPT-3 WTPT-4 WTPT-5 WPATH WPOL XLogP Zagreb cluster
0 -5.0931 25.939668 534.3813 317.363434 6 18 18 293 5.125258 -2.388694 ... 306.516036 1.977523 153.543080 82.176355 71.366725 286334.0 230.0 -8.623 752.0 1
1 0.4325 0.187056 317.0896 187.074612 0 20 21 171 2.269431 -1.093604 ... 174.200943 2.002310 78.030370 30.548034 47.482336 46759.0 130.0 1.134 436.0 1
2 -1.8743 3.513000 325.6625 190.878612 0 20 21 175 2.635543 -1.251529 ... 181.809125 1.997902 88.729976 35.878826 52.851150 52357.0 134.0 -2.012 458.0 1

3 rows × 223 columns

In [30]:
for c in clusters:
    grid = sns.FacetGrid(clusters, col="cluster")
    grid.map(plt.hist, c)
In [31]:
for col in clusters.columns:
    if col == "cluster":
        pass
    sns.boxplot(data=clusters, x='cluster', y=col)
    plt.title("{} by Class".format(col))
    plt.show()

Interpretation¶

In [ ]:
# for c in clusters:jupyter nbconvert --execute --to html clustering.ipynb

#     grid = sns.FacetGrid(clusters, col="cluster")
#     grid.map(sns.boxplot, c)